{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import gc\n",
    "import warnings\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from datetime import datetime\n",
    "from tqdm import tqdm\n",
    "from multiprocessing import Pool as ProcessPool\n",
    "\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "def weekList(record, mode='%A'):\n",
    "    \"\"\"Format the date part of every visit entry with a strftime pattern.\n",
    "\n",
    "    The first eight characters of each entry are read as YYYYMMDD\n",
    "    (year = [0:4], month = [4:6], day = [6:8]).\n",
    "\n",
    "    Args:\n",
    "        record: iterable of visit-entry strings.\n",
    "        mode: `datetime.strftime` pattern; '%A' yields the weekday name,\n",
    "            '%w' the weekday number.\n",
    "\n",
    "    Returns:\n",
    "        The formatted values joined by single spaces ('' for an empty record).\n",
    "    \"\"\"\n",
    "    formatted = []\n",
    "    for entry in record:\n",
    "        year, month, day = int(entry[:4]), int(entry[4:6]), int(entry[6:8])\n",
    "        formatted.append(datetime(year, month, day).strftime(mode))\n",
    "    return ' '.join(formatted)\n",
    "\n",
    "def monthList(record):\n",
    "    \"\"\"Return characters [4:6] of every visit entry, space-separated.\n",
    "\n",
    "    NOTE(review): presumably the two-digit month of a YYYYMMDD prefix --\n",
    "    confirm against the raw data format.\n",
    "    \"\"\"\n",
    "    month_codes = (entry[4:6] for entry in record)\n",
    "    return ' '.join(month_codes)\n",
    "\n",
    "def hourList(record):\n",
    "    \"\"\"Return everything after the ninth character of each entry, space-separated.\n",
    "\n",
    "    Any '|' inside that tail is turned into a space, so the result is one flat\n",
    "    space-delimited string.\n",
    "    NOTE(review): presumably the tail is a '|'-separated list of hours -- confirm.\n",
    "    \"\"\"\n",
    "    # The join separator contains no '|', so replacing after the join is\n",
    "    # equivalent to replacing inside each tail first.\n",
    "    tails = ' '.join(entry[9:] for entry in record)\n",
    "    return tails.replace('|', ' ')\n",
    "\n",
    "def load_test(file_name, data_dir='./test_visit/test_visit/'):\n",
    "    \"\"\"Read one test-set visit file into a DataFrame.\n",
    "\n",
    "    The file is parsed as header-less, tab-separated text with two columns.\n",
    "\n",
    "    Args:\n",
    "        file_name: name of the file inside `data_dir`.\n",
    "        data_dir: directory holding the test visit files.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with columns `user_id`, `record` and `record_split`\n",
    "        (`record` split on commas into a list of entries).\n",
    "    \"\"\"\n",
    "    # `sep` must be a keyword argument: pandas >= 2.0 rejects it positionally.\n",
    "    df = pd.read_csv(os.path.join(data_dir, file_name), sep='\\t', names=['user_id', 'record'])\n",
    "    df['record_split'] = df['record'].str.split(',')\n",
    "    return df\n",
    "def load_train(file_name, data_dir='./train_visit/'):\n",
    "    \"\"\"Read one training-set visit file into a DataFrame.\n",
    "\n",
    "    The file is parsed as header-less, tab-separated text with two columns.\n",
    "\n",
    "    Args:\n",
    "        file_name: name of the file inside `data_dir`.\n",
    "        data_dir: directory holding the training visit files.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with columns `user_id`, `record` and `record_split`\n",
    "        (`record` split on commas into a list of entries).\n",
    "    \"\"\"\n",
    "    # `sep` must be a keyword argument: pandas >= 2.0 rejects it positionally.\n",
    "    df = pd.read_csv(os.path.join(data_dir, file_name), sep='\\t', names=['user_id', 'record'])\n",
    "    df['record_split'] = df['record'].str.split(',')\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_wmh(dataframe):\n",
    "    \"\"\"Collapse a visit table into three space-separated strings.\n",
    "\n",
    "    Runs weekList, monthList and hourList (in that order) over every row of\n",
    "    `dataframe.record_split` and joins the per-row results with spaces.\n",
    "\n",
    "    Returns:\n",
    "        Tuple `(weeks, months, hours)` of strings.\n",
    "    \"\"\"\n",
    "    per_row_entries = dataframe.record_split\n",
    "    ws, ms, hs = (\n",
    "        ' '.join(per_row_entries.apply(extract).tolist())\n",
    "        for extract in (weekList, monthList, hourList)\n",
    "    )\n",
    "    return ws, ms, hs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load a single sample file so get_wmh can be tried out on a small input.\n",
    "# `sep` is passed by keyword because pandas >= 2.0 no longer accepts it positionally.\n",
    "temp = pd.read_csv('./test_visit/test_visit/000000.txt', sep='\\t', names=['user_id', 'record'])\n",
    "temp['record_split'] = temp['record'].str.split(',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8.49 ms, sys: 0 ns, total: 8.49 ms\n",
      "Wall time: 8.27 ms\n"
     ]
    }
   ],
   "source": [
    "# Time the week/month/hour extraction on the single sample held in `temp`.\n",
    "%time a,b,c =get_wmh(temp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
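    "# Earlier invocations, kept commented out for reference. They pass an explicit\n",
    "# third argument (apparently the worker-pool size); the active calls are in the\n",
    "# %time cells further down.\n",
    "#%time train_visit = visit_preprocess('./train_visit/', 'train', 20)\n",
    "#%time test_visit = visit_preprocess('./test_visit/test_visit/', 'test', 40)"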
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "def visit_preprocess(path, trainortest='train', n_workers=None):\n",
    "    \"\"\"Build one feature row per sample file listed in `path`.\n",
    "\n",
    "    The file name without its extension supplies `AreaID`. For the training\n",
    "    split the stem is expected to look like `<AreaID>_<label>` and the integer\n",
    "    label becomes `CategoryID`; for the test split `CategoryID` is -1.\n",
    "\n",
    "    NOTE(review): `path` is only used to list file names. The files are opened\n",
    "    by load_train / load_test, which receive the bare file name, so `path` must\n",
    "    be the directory those loaders read from.\n",
    "\n",
    "    Args:\n",
    "        path: directory whose entries are the sample files.\n",
    "        trainortest: 'train' or 'test'.\n",
    "        n_workers: size of the loading process pool; None keeps the previous\n",
    "            hard-coded sizes (20 for train, 40 for test).\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with columns AreaID, passenger_flow (row count of each\n",
    "        sample file), weeks, months, hours and CategoryID.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `trainortest` is neither 'train' nor 'test'.\n",
    "    \"\"\"\n",
    "    valid_splits = ('train', 'test')\n",
    "    if trainortest not in valid_splits:\n",
    "        # Previously this fell through and failed later with a NameError on `ids`.\n",
    "        raise ValueError('trainortest must be one of {}, got {!r}'.format(valid_splits, trainortest))\n",
    "    is_train = trainortest == 'train'\n",
    "\n",
    "    sample_names = sorted(os.listdir(path))\n",
    "    stems = [name.split('.')[0] for name in sample_names]\n",
    "    if is_train:\n",
    "        ids = [stem.split('_')[0] for stem in stems]\n",
    "        labels = [int(stem.split('_')[1]) for stem in stems]\n",
    "    else:\n",
    "        ids = stems\n",
    "    df = pd.DataFrame({'AreaID': ids})\n",
    "\n",
    "    # Load every sample file in parallel worker processes.\n",
    "    loader = load_train if is_train else load_test\n",
    "    if n_workers is None:\n",
    "        n_workers = 20 if is_train else 40\n",
    "    pool = ProcessPool(n_workers)\n",
    "    try:\n",
    "        records = pool.map(loader, sample_names)\n",
    "    finally:\n",
    "        # Always shut the workers down, even if a loader raised.\n",
    "        pool.close()\n",
    "        pool.join()\n",
    "    del pool\n",
    "    gc.collect()\n",
    "\n",
    "    df['passenger_flow'] = [visits.shape[0] for visits in records]\n",
    "\n",
    "    # Feature extraction runs sequentially in the parent process.\n",
    "    weeks, months, hours = [], [], []\n",
    "    for visits in tqdm(records):\n",
    "        ws, ms, hs = get_wmh(visits)\n",
    "        weeks.append(ws)\n",
    "        months.append(ms)\n",
    "        hours.append(hs)\n",
    "    df['weeks'] = weeks\n",
    "    df['months'] = months\n",
    "    df['hours'] = hours\n",
    "\n",
    "    df['CategoryID'] = labels if is_train else -1\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 40000/40000 [31:33<00:00, 21.13it/s]  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 35min 47s, sys: 2min 50s, total: 38min 37s\n",
      "Wall time: 39min 1s\n"
     ]
    }
   ],
   "source": [
    "# Build the feature table for the training split (long-running; see the timing in the cell output).\n",
    "%time train_visit = visit_preprocess('./train_visit/', 'train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10000/10000 [07:46<00:00, 21.46it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8min 44s, sys: 39.4 s, total: 9min 23s\n",
      "Wall time: 9min 24s\n"
     ]
    }
   ],
   "source": [
    "# Build the feature table for the test split (long-running; see the timing in the cell output).\n",
    "%time test_visit = visit_preprocess('./test_visit/test_visit/', trainortest='test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write both feature tables to CSV in the current working directory;\n",
    "# index=False leaves the DataFrame row index out of the files.\n",
    "train_visit.to_csv('./train_visit.csv', index=False)\n",
    "test_visit.to_csv('./test_visit.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
